[K] John K. Kruschke, Doing Bayesian Data Analysis, A Tutorial with R, JAGS, and STAN, 2015, Elsevier.
Source the utilities file from [K].
source("./DBDA2Eprograms/DBDA2E-utilities.R")
##
## *********************************************************************
## Kruschke, J. K. (2015). Doing Bayesian Data Analysis, Second Edition:
## A Tutorial with R, JAGS, and Stan. Academic Press / Elsevier.
## *********************************************************************
This project explores the comparison of two groups in a Gaussian model without predictors.
# Load the two-group IQ data and assemble the data list passed to Stan.
# (Uses `<-` for assignment throughout, per R convention.)
myDataFrame <- read.csv("documents-MScA Bayesian Methods (32014)-MScA 32014 Lecture 8-TwoGroupIQ.csv")
# Response: IQ score.
y <- as.numeric(myDataFrame[, "Score"])
# Group recoded to integers 1/2 (factor levels are alphabetical:
# 1 = "Placebo", 2 = "Smart Drug").
x <- as.numeric(as.factor(myDataFrame[, "Group"]))
(xLevels <- levels(as.factor(myDataFrame[, "Group"])))
Ntotal <- length(y)
# Data list for Stan: observations, group index, and the sample statistics
# used inside the model to set prior scales.
dataList <- list(
  y = y,
  x = x,
  Ntotal = Ntotal,
  meanY = mean(y),
  sdY = sd(y)
)
dataList
## $y
## [1] 102 107 92 101 110 68 119 106 99 103 90 93 79 89 137 119 126
## [18] 110 71 114 100 95 91 99 97 106 106 129 115 124 137 73 69 95
## [35] 102 116 111 134 102 110 139 112 122 84 129 112 127 106 113 109 208
## [52] 114 107 50 169 133 50 97 139 72 100 144 112 109 98 106 101 100
## [69] 111 117 104 106 89 84 88 94 78 108 102 95 99 90 116 97 107
## [86] 102 91 94 95 86 108 115 108 88 102 102 120 112 100 105 105 88
## [103] 82 111 96 92 109 91 92 123 61 59 105 184 82 138 99 93 93
## [120] 72
##
## $x
## [1] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [36] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
##
## $Ntotal
## [1] 120
##
## $meanY
## [1] 104.1333
##
## $sdY
## [1] 22.43532
Robust model: assume a Student-t distribution for the observations.
data {
  int<lower=1> Ntotal;
  int x[Ntotal];    // Group variable (1 or 2)
  real y[Ntotal];   // Observed scores
  real meanY;       // Sample mean of y; centers the prior for mu
  real sdY;         // Sample sd of y; scales the priors
}
transformed data {
  real unifLo;
  real unifHi;
  real normalSigma;
  real expLambda;   // Rate of the exponential prior for nu
  unifLo = sdY/100;
  unifHi = sdY*100;
  normalSigma = sdY*100;
  expLambda = 1/30.0; // Setting value for expLambda
}
parameters {
  real<lower=0> nu;
  real mu[2];    // One location per group
  // Bounds match the uniform prior support below; declaring only <lower=0>
  // lets the sampler propose values outside the prior support, causing
  // needless rejections.
  real<lower=unifLo, upper=unifHi> sigma[2];
}
model {
  sigma ~ uniform(unifLo, unifHi); // Vectorized over both groups
  mu ~ normal(meanY, normalSigma); // Vectorized over both groups
  nu ~ exponential(expLambda);     // Prior on the heavy-tail parameter
  for (i in 1:Ntotal){
    y[i] ~ student_t(nu, mu[x[i]], sigma[x[i]]); // Group-specific location and scale
  }
}
Compile the Stan model, then draw posterior samples with sampling().
parameters = c("mu" , "sigma" , "nu") # The parameters to be monitored
# Sampler settings (consistent `<-` assignment instead of mixed `=`/`<-`).
adaptSteps <- 500     # Number of steps to "tune" the samplers (not passed below)
burnInSteps <- 1000   # Warmup iterations discarded from each chain
nChains <- 4
thinSteps <- 1        # No thinning
numSavedSteps <- 5000 # Desired total post-warmup draws across all chains
# Get MC sample of posterior:
stanFitRobust <- sampling(
  object = stanDsoRobust,
  data = dataList,
  pars = parameters,
  chains = nChains,
  cores = nChains,
  # Per-chain iterations: saved draws split across chains (times thinning)
  # plus the warmup that will be discarded.
  iter = (ceiling(numSavedSteps / nChains) * thinSteps
          + burnInSteps),
  warmup = burnInSteps,
  init = "random",
  thin = thinSteps
)
stanFitRobust
## Inference for Stan model: 9a9c40e6afa16673620cbf98f3528972.
## 4 chains, each with iter=2250; warmup=1000; thin=1;
## post-warmup draws per chain=1250, total post-warmup draws=5000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5%
## mu[1] 99.28 0.02 1.76 95.83 98.14 99.31 100.43 102.75
## mu[2] 107.16 0.04 2.73 101.85 105.36 107.16 108.92 112.59
## sigma[1] 11.37 0.03 1.73 8.33 10.15 11.24 12.43 15.11
## sigma[2] 17.96 0.04 2.68 13.09 16.04 17.81 19.70 23.56
## nu 3.89 0.04 1.78 1.97 2.81 3.51 4.48 8.08
## lp__ -451.01 0.03 1.61 -454.98 -451.82 -450.68 -449.83 -448.88
## n_eff Rhat
## mu[1] 5288 1
## mu[2] 5667 1
## sigma[1] 3568 1
## sigma[2] 3560 1
## nu 2551 1
## lp__ 2225 1
##
## Samples were drawn using NUTS(diag_e) at Thu Dec 6 02:32:18 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
# Posterior summaries and MCMC diagnostics for the robust two-group fit.
plot(stanFitRobust)
# Use TRUE/FALSE rather than T/F: T and F are ordinary variables that can be
# reassigned, so relying on them is fragile.
rstan::traceplot(stanFitRobust, ncol = 1, inc_warmup = FALSE)
pairs(stanFitRobust, pars = c("nu","mu","sigma"))
stan_diag(stanFitRobust,information = "sample",chain = 0)
Same fit results from the workshop, of course.
# Posterior draws of (mu, sigma) for group 1 (factor level 1 = "Placebo").
dis1 <- data.frame(
Mu = rstan::extract(stanFitRobust, pars = "mu[1]")$'mu[1]',
Sigma = rstan::extract(stanFitRobust, pars = "sigma[1]")$'sigma[1]'
)
# Posterior draws of (mu, sigma) for group 2 (factor level 2 = "Smart Drug").
dis2 <- data.frame(
Mu = rstan::extract(stanFitRobust, pars = "mu[2]")$'mu[2]',
Sigma = rstan::extract(stanFitRobust, pars = "sigma[2]")$'sigma[2]'
)
# Stack both groups into one long tibble with a Group factor label.
dis_df <- bind_rows("group_1" = dis1, "group_2" = dis2, .id = "Group") %>%
mutate(Group = factor(Group)) %>% as_tibble()
# Overlay kernel density estimates of the two posterior means.
denDis1 <- density(dis1[, "Mu"])
denDis2 <- density(dis2[, "Mu"])
plot(denDis1, col = "blue", xlim = c(90, 120))
lines(denDis2, col = "green")
There is a positive correlation between Mu and Sigma, and group_2 shows higher values of both.
ggcor(dis_df, Group)
We will use logistic regression to see if predictability of groups can be achieved with Mu and Sigma.
# Model specification: predict Group from the posterior draws of Mu and Sigma.
rec <- recipe(Group ~ Mu + Sigma, data = dis_df)
# 10-fold cross-validation; class probabilities are required for ROC summaries.
control <- trainControl(
method = "cv",
number = 10,
summaryFunction = twoClassSummary,
classProbs = TRUE,
savePredictions = TRUE
)
# Logistic regression (glm with a logit link), evaluated on ROC AUC.
fit_logit <- train(
rec,
data = dis_df,
metric = "ROC",
method = "glm",
trControl = control,
family = binomial(link = "logit")
)
Summary of model shows that both Mu and Sigma are statistically significant for predicting Group.
summary(fit_logit)
##
## Call:
## NULL
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.4227 -0.0073 0.0000 0.0022 2.8478
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -195.22557 10.27965 -18.99 <2e-16 ***
## Mu 1.67937 0.09056 18.54 <2e-16 ***
## Sigma 1.59158 0.09432 16.88 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 13862.94 on 9999 degrees of freedom
## Residual deviance: 466.23 on 9997 degrees of freedom
## AIC: 472.23
##
## Number of Fisher Scoring iterations: 10
# Class predictions on the training data, wrapped in a one-column data frame.
logit_predictions <- predict.train(fit_logit) %>%
data.frame(Group = .)
# Attach the predicted class to each posterior draw.
dis_df_logit <- dis_df %>% mutate(logit_prediction = factor(logit_predictions$Group, levels = c("group_1", "group_2")))
# Predicted class in (Mu, Sigma) space, faceted by the actual group.
dis_df_logit %>%
ggplot(aes(Mu, Sigma, color = logit_prediction)) +
geom_point() +
facet_grid(
cols = vars(Group), labeller = labeller(Group = c(
group_1 = "Actual Group 1",
group_2 = "Actual Group 2"))
) +
scale_color_viridis_d(name = "Logit Predictions", labels = c("Group 1", "Group 2")) +
labs(title = "Logit Solution Classification Shows Demarcation",
subtitle = "A few mistakes near the edge")
# Per-class means of Mu and Sigma: predictions vs the actual data.
# NOTE(review): summarise_if() is superseded; summarise(across(where(is.numeric), mean))
# is the modern equivalent.
compare_predictions_groups <- bind_rows(
"prediction" = dis_df_logit %>%
group_by(logit_prediction) %>%
summarise_if(is.numeric, mean) %>%
rename(Group = logit_prediction) %>%
ungroup(),
"data" = dis_df %>%
group_by(Group) %>%
summarise_if(is.numeric, mean), .id = "label"
) %>%
arrange(Mu)
| label | Group | Mu | Sigma |
|---|---|---|---|
| prediction | group_1 | 99.27736 | 11.36632 |
| data | group_1 | 99.28432 | 11.36688 |
| data | group_2 | 107.16002 | 17.95951 |
| prediction | group_2 | 107.17805 | 17.96932 |
# Compare predicted vs actual class frequencies (tabyl returns n and percent).
compare_proportions <- bind_rows("logit" = logit_predictions %>%
janitor::tabyl(Group),
"data" = dis_df %>%
janitor::tabyl(Group), .id = "label") %>%
arrange(Group)
| label | Group | n | percent |
|---|---|---|---|
| logit | group_1 | 5007 | 0.5007 |
| data | group_1 | 5000 | 0.5000 |
| logit | group_2 | 4993 | 0.4993 |
| data | group_2 | 5000 | 0.5000 |
Next, k-means clustering will be used to see if we reach the same conclusion as logistic regression.
# Reproducible train/test split of the posterior draws.
set.seed(1)
test_train_km <- initial_split(dis_df, prop = 79/125)
dis_df_train_km <- training(test_train_km)
dis_df_test_km <- testing(test_train_km)
# Recipe: drop the Group label, then min-max scale the numeric columns.
# NOTE(review): step_scale_min_max() comes from the non-CRAN "steps" package —
# confirm it is installed; recipes itself offers step_range() for this purpose.
dis_train_scaled_recipe <- dis_df_train_km %>%
recipe() %>%
step_rm(Group) %>%
steps::step_scale_min_max(all_numeric())
# Estimate the scaling parameters on the training set only.
dis_train_scaled_prep <- prep(dis_train_scaled_recipe, training = dis_df_train_km, retain = TRUE)
## [1] "Mu" "Sigma"
dis_train_scaled <- juice(dis_train_scaled_prep)
# NOTE(review): `newdata` is the old recipes argument name; current versions
# use `new_data`.
dis_test_scaled <- bake(dis_train_scaled_prep, newdata = dis_df_test_km)
## [1] "Mu" "Sigma"
# 1. Run kmeans for k = 2 through 5 clusters (100 random restarts each).
kclusters <- data.frame(k = 2:5) %>%
group_by(k) %>%
do(kclust = kmeans(dis_train_scaled, .$k, iter.max = 100, nstart = 100))
# 2. Save the VAF (Variance Accounted For: betweenss / totss) for each number of clusters
clusterings <- kclusters %>%
group_by(k) %>%
do(glance(.$kclust[[1]])) %>%
mutate(VAF = betweenss / totss)
# 3/4 Size of the clusters & Centroids of the clusters
clusters <- kclusters %>%
group_by(k) %>%
do(tidy(.$kclust[[1]])) %>%
mutate(size_prop = size / nrow(dis_train_scaled)) %>%
select(k, size, size_prop, cluster, everything())
# Cluster assignment of every training point, per k.
assignments <- kclusters %>%
group_by(k) %>%
do(augment(.$kclust[[1]], dis_train_scaled))
There is a slight elbow at 3 k’s.
# Scree-style plot: variance accounted for as the number of clusters grows.
clusterings %>%
ggplot(aes(k, VAF)) +
geom_line() +
geom_point() +
scale_x_continuous(breaks = seq(min(clusterings$k), max(clusterings$k), 1)) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Variance Accounted for Across k's",
y = "VAF")
However, interestingly, there is an even split for k = 2. This may suggest two distinct groups.
# Cluster size proportions for each k; an even split at k = 2 supports two groups.
clusters %>%
ggplot(aes(factor(k), size_prop)) +
geom_col(aes(fill = size_prop), color = "white") +
scale_fill_viridis_c(name = "Cluster Size\nProportion", labels = scales::percent) +
scale_y_continuous(labels = scales::percent) +
labs(title = "Comparing Cluster Size Across k",
x = "Number of Clusters", y = "Cluster Size Proportion of Training Count") +
theme(legend.title = element_text(size = 10))
Fit k-means with 2 clusters:
# Final k = 2 fit with many restarts for a stable solution.
kmclusters_k2 <- kmeans(dis_train_scaled, centers = 2, iter.max = 100, nstart = 1000)
# Re-attach the original (unscaled) columns and the true group label.
assignment_k2 <- augment(kmclusters_k2, dis_train_scaled) %>%
mutate(Group = dis_df_train_km$Group,
Mu = dis_df_train_km$Mu,
Sigma = dis_df_train_km$Sigma,
cluster = factor(.cluster, levels = c(1, 2))) %>%
select(-.cluster)
K-means does a good job of separating the known groups:
# Cluster assignment in (Mu, Sigma) space, faceted by the actual group.
assignment_k2 %>%
ggplot(aes(Mu, Sigma, color = cluster)) +
geom_point() +
facet_grid(
cols = vars(Group), labeller = labeller(Group = c(
group_1 = "Group 1",
group_2 = "Group 2"))
) +
scale_color_viridis_d(name = "Cluster") +
labs(title = "K-means Cluster Solution Shows Difference in Groups",
subtitle = "Demarcation is clear, with some mistakes near the edge")
# Compare k-means centroids (training assignments) with the per-group
# posterior means.
# Fix: removed the side-effect assignment `centers_k2 <- ...` that was buried
# inside the bind_rows() call; arguments should not create globals, and the
# name was never reused before being overwritten later in the file.
compare_clusters_groups <- bind_rows(
  assignment_k2 %>%
    group_by(cluster) %>%
    summarise_if(is.numeric, mean) %>%
    rename(`cluster/group` = cluster),
  dis_df %>%
    group_by(Group) %>%
    summarise_if(is.numeric, mean) %>%
    rename(`cluster/group` = Group)
) %>%
  arrange(Mu)
| cluster/group | Mu | Sigma |
|---|---|---|
| group_1 | 99.28432 | 11.36688 |
| 1 | 99.40510 | 11.35953 |
| group_2 | 107.16002 | 17.95951 |
| 2 | 107.26356 | 18.06951 |
Holdout results are stable relative to train, so the solution is credible. Groups appear to be different.
# Classify the holdout by running kmeans seeded at the training centers
# (effectively assigns each point to the nearest training centroid, then
# iterates to convergence — NOTE(review): confirm this is intended rather
# than a pure nearest-centroid assignment).
km_holdout_k2 <- kmeans(dis_test_scaled, centers = kmclusters_k2$centers)
# Re-attach the original (unscaled) columns and the true group label.
assignment_k2_holdout <- augment(km_holdout_k2, dis_test_scaled) %>%
mutate(Group = dis_df_test_km$Group,
Mu = dis_df_test_km$Mu,
Sigma = dis_df_test_km$Sigma,
cluster = factor(.cluster, levels = c(1, 2))) %>%
select(-.cluster)
# Holdout assignments in (Mu, Sigma) space, faceted by the actual group.
assignment_k2_holdout %>%
ggplot(aes(Mu, Sigma, color = cluster)) +
geom_point() +
facet_grid(
cols = vars(Group), labeller = labeller(Group = c(
group_1 = "Group 1",
group_2 = "Group 2"))
) +
scale_color_viridis_d(name = "Cluster") +
labs(title = "Stable Holdout")
# Compare holdout centroids with the per-group posterior means.
# Fix: removed the side-effect assignment `centers_k2 <- ...` hidden inside
# the bind_rows() call — it silently overwrote the object of the same name
# created earlier and was never reused.
compare_clusters_holdout_groups <- bind_rows(
  assignment_k2_holdout %>%
    group_by(cluster) %>%
    summarise_if(is.numeric, mean) %>%
    rename(`cluster/group` = cluster),
  dis_df %>%
    group_by(Group) %>%
    summarise_if(is.numeric, mean) %>%
    rename(`cluster/group` = Group)
) %>%
  arrange(Mu)
| cluster/group | Mu | Sigma |
|---|---|---|
| group_1 | 99.28432 | 11.36688 |
| 1 | 99.31003 | 11.48111 |
| group_2 | 107.16002 | 17.95951 |
| 2 | 107.17198 | 18.02644 |
Read the data and create the data list.
# Hierarchical regression data: Y on X, grouped by subject.
df <- read.csv("DBDA2Eprograms/HierLinRegressData.csv")
# Data list for Stan: pooled observations plus the subject index that
# defines the Ngroups panels.
dataList <- list(
Ntotal = length(df$Y),
y = df$Y,
x = df$X,
Ngroups = max(df$Subj),
group = df$Subj
)
data {
  int<lower=1> Ntotal;
  vector[Ntotal] y;
  vector[Ntotal] x;
  int<lower=1> Ngroups;
  int<lower=1, upper=Ngroups> group[Ntotal];
}
transformed data {
  real meanY;
  real sdY;
  vector[Ntotal] zy; // normalized y
  real meanX;
  real sdX;
  vector[Ntotal] zx; // normalized x
  meanY = mean(y);
  sdY = sd(y);
  zy = (y - meanY) / sdY;
  meanX = mean(x);
  sdX = sd(x);
  zx = (x - meanX) / sdX;
}
parameters {
  real<lower=0> zsigma;
  real<lower=0> nu;
  real zbeta0mu;          // Population-level intercept (standardized scale)
  real zbeta1mu;          // Population-level slope (standardized scale)
  real<lower=0> zbeta0sigma;
  real<lower=0> zbeta1sigma;
  vector[Ngroups] zbeta0; // Group-level intercepts
  vector[Ngroups] zbeta1; // Group-level slopes
}
transformed parameters {
  real<lower=0> sigma;
  real beta0mu;
  real beta1mu;
  vector[Ngroups] beta0;
  vector[Ngroups] beta1;
  // Transform to original scale (these formulas assume the regression is
  // of zy on zx):
  sigma = zsigma * sdY;
  beta0mu = meanY + zbeta0mu * sdY - zbeta1mu * meanX * sdY / sdX;
  beta1mu = zbeta1mu * sdY / sdX;
  beta0 = meanY + zbeta0 * sdY - zbeta1 * meanX * sdY / sdX; // vectorized
  beta1 = zbeta1 * sdY / sdX; // vectorized
}
model {
  zsigma ~ uniform(0.001, 1000);
  nu ~ exponential(1/30.0);
  zbeta0mu ~ normal(0, 10.0^2);
  zbeta1mu ~ normal(0, 10.0^2);
  zbeta0sigma ~ uniform(0.001, 1000);
  zbeta1sigma ~ uniform(0.001, 1000);
  zbeta0 ~ normal(zbeta0mu, zbeta0sigma); // vectorized
  zbeta1 ~ normal(zbeta1mu, zbeta1sigma); // vectorized
  for (i in 1:Ntotal) {
    // Fix: regress standardized y on standardized x. The original used raw
    // x[i] here, which is inconsistent with the back-transformation above
    // (beta1 = zbeta1 * sdY / sdX only holds when zx is the predictor).
    zy[i] ~ student_t(1+nu, zbeta0[group[i]] + zbeta1[group[i]] * zx[i], zsigma);
  }
}
# Fit the hierarchical robust regression with default NUTS settings.
fit_robust <- sampling (
stanDsoRobustRegPanel,
data = dataList,
pars = c(
"nu",
"sigma",
"beta0mu",
"beta1mu",
"beta0",
"beta1",
"zbeta0sigma",
"zbeta1sigma"
),
iter = 5000,
chains = 4,
cores = 4
)
There were 1843 divergent transitions after warmup. The workshop tries to decrease stepsize and increase iter, adapt_delta, and max_treedepth.
# Refit with a more conservative sampler configuration to reduce divergences:
# more iterations, a much stricter acceptance target, a smaller initial
# stepsize, and a deeper tree.
fit_robust_adjust <- sampling (
stanDsoRobustRegPanel,
data = dataList,
pars = c(
"nu",
"sigma",
"beta0mu",
"beta1mu",
"beta0",
"beta1",
"zbeta0sigma",
"zbeta1sigma"
),
iter = 50000,
chains = 4,
cores = 4,
control = list(
adapt_delta = 0.999,
stepsize = 0.01,
max_treedepth = 15
)
)
However, many divergences still occur:
# Diagnostics for the adjusted fit: posterior intervals, traces, and
# pairwise scatter of the top-level parameters.
plot(fit_robust_adjust)
rstan::traceplot(fit_robust_adjust)
pairs(fit_robust_adjust, pars = c("nu", "sigma", "beta0mu", "beta1mu"))
Adjusting the priors for zsigma, zbeta0sigma, and zbeta1sigma to Cauchy:
data {
  int<lower=1> Ntotal;
  vector[Ntotal] y;
  vector[Ntotal] x;
  int<lower=1> Ngroups;
  int<lower=1, upper=Ngroups> group[Ntotal];
}
transformed data {
  real meanY;
  real sdY;
  vector[Ntotal] zy; // normalized y
  real meanX;
  real sdX;
  vector[Ntotal] zx; // normalized x
  meanY = mean(y);
  sdY = sd(y);
  zy = (y - meanY) / sdY;
  meanX = mean(x);
  sdX = sd(x);
  zx = (x - meanX) / sdX;
}
parameters {
  real<lower=0> zsigma;
  real<lower=0> nu;
  real zbeta0mu;          // Population-level intercept (standardized scale)
  real zbeta1mu;          // Population-level slope (standardized scale)
  real<lower=0> zbeta0sigma;
  real<lower=0> zbeta1sigma;
  vector[Ngroups] zbeta0; // Group-level intercepts
  vector[Ngroups] zbeta1; // Group-level slopes
}
transformed parameters {
  real<lower=0> sigma;
  real beta0mu;
  real beta1mu;
  vector[Ngroups] beta0;
  vector[Ngroups] beta1;
  // Transform to original scale (these formulas assume the regression is
  // of zy on zx):
  sigma = zsigma * sdY;
  beta0mu = meanY + zbeta0mu * sdY - zbeta1mu * meanX * sdY / sdX;
  beta1mu = zbeta1mu * sdY / sdX;
  beta0 = meanY + zbeta0 * sdY - zbeta1 * meanX * sdY / sdX; // vectorized
  beta1 = zbeta1 * sdY / sdX; // vectorized
}
model {
  // Half-Cauchy priors (via the <lower=0> constraints) on the scales,
  // replacing the hard-bounded uniforms that caused divergences.
  zsigma ~ cauchy(0, 2);
  nu ~ exponential(1/30.0);
  zbeta0mu ~ normal(0, 10.0^2);
  zbeta1mu ~ normal(0, 10.0^2);
  zbeta0sigma ~ cauchy(0, 2);
  zbeta1sigma ~ cauchy(0, 2);
  zbeta0 ~ normal(zbeta0mu, zbeta0sigma); // vectorized
  zbeta1 ~ normal(zbeta1mu, zbeta1sigma); // vectorized
  for (i in 1:Ntotal) {
    // Fix: regress standardized y on standardized x. The original used raw
    // x[i] here, which is inconsistent with the back-transformation above
    // (beta1 = zbeta1 * sdY / sdX only holds when zx is the predictor).
    zy[i] ~ student_t(1+nu, zbeta0[group[i]] + zbeta1[group[i]] * zx[i], zsigma);
  }
}
Additionally, decreasing stepsize and increasing warmup, iter, adapt_delta, and max_treedepth:
# Fit the Cauchy-prior model with an aggressive sampler configuration:
# thinning, very high acceptance target, tiny stepsize, deep trees.
fit_robust_cauchy <- sampling(
stanDsoRobustRegPanel_cauchy,
data = dataList,
pars = c(
"nu",
"sigma",
"beta0mu",
"beta1mu",
"beta0",
"beta1",
"zbeta0sigma",
"zbeta1sigma"
),
iter = 5000,
chains = 4,
cores = 4,
warmup = 1500,
thin = 5,
control = list(
adapt_delta = 0.999999999,
stepsize = 0.0001,
max_treedepth = 20
)
)
# Pairwise posterior scatter of the top-level parameters.
pairs(fit_robust_cauchy, pars = c("nu", "sigma", "beta0mu", "beta1mu"))
There is an auto-correlation problem with the \(\beta_{0\mu}\) and \(\beta_{1\mu}\) parameters:
stan_ac(fit_robust_cauchy, pars = c("nu", "sigma", "beta0mu", "beta1mu"))
By adjusting priors to cauchy and increasing several parameters, we were able to dramatically reduce the divergences from ~1800 to 3. However, the drawback to this approach (compared to parameterizing the model better based on understanding of the data) is speed. The model takes much longer to fit with a small step size. And while increasing the warmup ameliorates the auto-correlation, it does not fix it completely.
Using life_exp as the response, fit Gaussian and robust non-hierarchical regression models with the Bayesian approach. The data set state.x77 (from datasets) was used in the multiple regression example in Statistical Analysis (MScA 31007):
# state.x77 (base datasets) as a data frame with snake_case column names.
dta <- janitor::clean_names(as.data.frame(state.x77))
| population | income | illiteracy | life_exp | murder | hs_grad | frost | area | |
|---|---|---|---|---|---|---|---|---|
| Alabama | 3615 | 3624 | 2.1 | 69.05 | 15.1 | 41.3 | 20 | 50708 |
| Alaska | 365 | 6315 | 1.5 | 69.31 | 11.3 | 66.7 | 152 | 566432 |
| Arizona | 2212 | 4530 | 1.8 | 70.55 | 7.8 | 58.1 | 15 | 113417 |
| Arkansas | 2110 | 3378 | 1.9 | 70.66 | 10.1 | 39.9 | 65 | 51945 |
| California | 21198 | 5114 | 1.1 | 71.71 | 10.3 | 62.6 | 20 | 156361 |
| Colorado | 2541 | 4884 | 0.7 | 72.06 | 6.8 | 63.9 | 166 | 103766 |
# Data for the intercept-only models: sample size and the response vector.
data_list <- list(
Ntotal = nrow(dta),
life_exp = dta$life_exp
)
Fitting intercept only models. First normal, then robust.
data {
  int<lower=0> Ntotal;
  vector<lower=0>[Ntotal] life_exp; // Response: life expectancy (years)
}
transformed data {
  real mean_life_exp; // Computed for reference; not used in the model block
  real sd_life_exp;
  real unifLo;        // Bounds of the uniform prior for sigma
  real unifHi;
  mean_life_exp = mean(life_exp);
  sd_life_exp = sd(life_exp);
  unifLo = sd_life_exp / 1000;
  unifHi = sd_life_exp * 1000;
}
parameters {
  real beta0; // Intercept: the common mean of life_exp
  // Bounds match the uniform prior support below; declaring only <lower=0>
  // lets the sampler propose values outside the prior support.
  real<lower=unifLo, upper=unifHi> sigma;
}
model {
  sigma ~ uniform(unifLo, unifHi);
  life_exp ~ normal(beta0, sigma);
}
# Fit the intercept-only Gaussian model.
# Fix: removed the trailing comma after the last argument — a trailing comma
# in an R function call produces an empty argument.
fit_normal_life_exp <- sampling(
  stan_normal_model,
  data = data_list,
  pars = c(
    "beta0",
    "sigma"
  ),
  iter = 5000,
  chains = 4,
  cores = 4
)
Model estimates mean of life_exp:
show(fit_normal_life_exp)
## Inference for Stan model: e71bd361acb5cac7e89369ac91f60be6.
## 4 chains, each with iter=5000; warmup=2500; thin=1;
## post-warmup draws per chain=2500, total post-warmup draws=10000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta0 70.88 0.00 0.19 70.50 70.75 70.88 71.01 71.26 9112 1
## sigma 1.38 0.00 0.15 1.13 1.27 1.36 1.47 1.70 7474 1
## lp__ -39.95 0.02 1.02 -42.68 -40.36 -39.64 -39.22 -38.96 4574 1
##
## Samples were drawn using NUTS(diag_e) at Thu Dec 6 02:34:23 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
Data mean: 70.8786
All model diagnostics looks good:
# Diagnostics for the Gaussian intercept-only fit: pairs, traces,
# autocorrelation, densities, and the NUTS sampler summary.
pairs(fit_normal_life_exp, pars = c("beta0", "sigma"))
stan_trace(fit_normal_life_exp, pars = c("beta0", "sigma"))
stan_ac(fit_normal_life_exp, pars = c("beta0", "sigma"))
stan_dens(fit_normal_life_exp, pars = c("beta0", "sigma"))
stan_diag(fit_normal_life_exp)
data {
  int<lower=1> Ntotal;
  vector<lower=0>[Ntotal] life_exp; // Response: life expectancy (years)
}
transformed data {
  real expLambda;     // Rate of the exponential prior for nu
  real mean_life_exp; // Computed for reference; not used in the model block
  real sd_life_exp;
  real unifLo;        // Bounds of the uniform prior for sigma
  real unifHi;
  expLambda = 1 / 30.0;
  mean_life_exp = mean(life_exp);
  sd_life_exp = sd(life_exp);
  unifLo = sd_life_exp / 1000;
  unifHi = sd_life_exp * 1000;
}
parameters {
  real<lower=0> nu; // Student-t degrees of freedom (large => near-Gaussian)
  real beta0;       // Intercept: the common center of life_exp
  // Bounds match the uniform prior support below; declaring only <lower=0>
  // lets the sampler propose values outside the prior support.
  real<lower=unifLo, upper=unifHi> sigma;
}
model {
  nu ~ exponential(expLambda);
  sigma ~ uniform(unifLo, unifHi);
  life_exp ~ student_t(nu, beta0, sigma);
}
# Fit the intercept-only robust (Student-t) model.
# Fix: removed the trailing comma after the last argument — a trailing comma
# in an R function call produces an empty argument.
fit_robust_life_exp <- sampling(
  stan_robust_model,
  data = data_list,
  pars = c(
    "beta0",
    "sigma",
    "nu"
  ),
  iter = 5000,
  chains = 4,
  cores = 4
)
\(\nu\) is high, so probably Gaussian.
show(fit_robust_life_exp)
## Inference for Stan model: 4c1c3ec54062deb9ea15dce8e4d95bc3.
## 4 chains, each with iter=5000; warmup=2500; thin=1;
## post-warmup draws per chain=2500, total post-warmup draws=10000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta0 70.88 0.00 0.20 70.50 70.75 70.88 71.01 71.27 7151 1
## sigma 1.34 0.00 0.15 1.07 1.23 1.33 1.43 1.66 7362 1
## nu 40.82 0.34 31.46 5.98 18.28 32.19 54.40 124.21 8513 1
## lp__ -55.59 0.02 1.22 -58.80 -56.12 -55.28 -54.71 -54.21 4867 1
##
## Samples were drawn using NUTS(diag_e) at Thu Dec 6 02:35:13 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(fit_robust_life_exp)
Slight memory in \(\sigma\) and \(\beta\):
# Diagnostics for the robust intercept-only fit: pairs, traces,
# autocorrelation, densities, and the NUTS sampler summary.
pairs(fit_robust_life_exp, pars = c("beta0", "sigma", "nu"))
stan_trace(fit_robust_life_exp, pars = c("beta0", "sigma", "nu"))
stan_ac(fit_robust_life_exp, pars = c("beta0", "sigma", "nu"))
stan_dens(fit_robust_life_exp, pars = c("beta0", "sigma", "nu"))
stan_diag(fit_robust_life_exp)
# Single-predictor design: proportion of high-school graduates.
# Fix: the assignment target was garbled to `hs_gradmodel_df` (section text
# fused into the name), while the list below references `model_df` — which
# would otherwise be undefined.
model_df <- dta %>% select(hs_grad)
data_list <- list(
  Ntotal = nrow(dta),
  Nx = ncol(model_df),
  x = as.matrix(model_df),
  life_exp = dta$life_exp
)
data {
  int<lower=1> Ntotal;
  int<lower=1> Nx;          // Number of predictors
  vector[Ntotal] life_exp;  // Response: life expectancy (years)
  matrix[Ntotal, Nx] x;     // Predictor matrix
}
transformed data {
  // Fix: removed `real expLambda;` — it was declared but never assigned or
  // used (leftover from the robust-model template; uninitialized values are
  // NaN in Stan).
  real mean_life_exp; // Computed for reference; not used in the model block
  real sd_life_exp;
  real unifLo;        // Bounds of the uniform prior for sigma
  real unifHi;
  mean_life_exp = mean(life_exp);
  sd_life_exp = sd(life_exp);
  unifLo = sd_life_exp / 1000;
  unifHi = sd_life_exp * 1000;
}
parameters {
  real beta0;        // Intercept
  vector[Nx] xbeta;  // Regression coefficients
  // Bounds match the uniform prior support below.
  real<lower=unifLo, upper=unifHi> sigma;
}
model {
  sigma ~ uniform(unifLo, unifHi);
  life_exp ~ normal(beta0 + x * xbeta, sigma);
}
# Fit the Gaussian regression with hs_grad as predictor.
# Fix: removed the trailing comma after the last argument — a trailing comma
# in an R function call produces an empty argument.
fit_normal_life_exp_predictors <- sampling(
  stan_normal_predictors,
  data = data_list,
  pars = c(
    "beta0",
    "xbeta",
    "sigma"
  ),
  iter = 5000,
  chains = 4,
  cores = 4
)
Initial takeaways from first model:
beta0 is now 65.75 (down from the mean of 70.88), so average life_exp is 65.75 when there are no high school grads
However, hs_grad is significant and increases life_expectancy for every one unit change in hs_grad
show(fit_normal_life_exp_predictors)
## Inference for Stan model: 1f82668a852b76fbe0953b6e6582a139.
## 4 chains, each with iter=5000; warmup=2500; thin=1;
## post-warmup draws per chain=2500, total post-warmup draws=10000.
##
## mean se_mean sd 2.5% 25% 50% 75% 97.5% n_eff Rhat
## beta0 65.73 0.02 1.08 63.59 65.01 65.72 66.45 67.84 3199 1
## xbeta[1] 0.10 0.00 0.02 0.06 0.08 0.10 0.11 0.14 3193 1
## sigma 1.13 0.00 0.12 0.92 1.05 1.12 1.20 1.39 3747 1
## lp__ -30.34 0.02 1.26 -33.57 -30.92 -30.01 -29.42 -28.90 3226 1
##
## Samples were drawn using NUTS(diag_e) at Thu Dec 6 02:36:05 2018.
## For each parameter, n_eff is a crude measure of effective sample size,
## and Rhat is the potential scale reduction factor on split chains (at
## convergence, Rhat=1).
plot(fit_normal_life_exp_predictors, pars = "xbeta")
There is memory in \(\sigma\) and betas
beta0 and beta1 show a strong negative relationship
# Diagnostics for the predictor model: pairs, traces, autocorrelation,
# densities, and the NUTS sampler summary.
pairs(fit_normal_life_exp_predictors, pars = c("beta0", "xbeta", "sigma"))
stan_trace(fit_normal_life_exp_predictors, pars = c("beta0", "xbeta", "sigma"))
stan_ac(fit_normal_life_exp_predictors, pars = c("beta0", "xbeta", "sigma"))
stan_dens(fit_normal_life_exp_predictors, pars = c("beta0", "xbeta", "sigma"))
stan_diag(fit_normal_life_exp_predictors)
Question 1: the data were shown to be distinct between classes by both logistic regression and k-means; both methods produced a clear separation of the groups
Question 2 could not be fully solved; however, we were able to reduce the divergences dramatically at the cost of much slower computation
No issues with any model diagnostics for all models in question 3
Intercept models simply estimated the mean from the data (no surprise)
Robust model was unnecessary due to high \(\nu\), so Gaussian will likely be fine
hs_grad was significant with a positive loading when viewing the HDI